EN conditionLBA conditionlibrary(mixOmics)
library(metagenomeSeq)
library(reshape2)
library(ggplot2)
df_abundances <- read.delim("../data/abondances.csv", sep = ",",
stringsAsFactors = FALSE)
df_abundances[df_abundances[ ,1] == "&",1] <- "Bacteria;Proteobacteria;Gammaproteobacteria;Pasteurellales;Pasteurellaceae;Mannheimia;Mannheimia haemolytica"
abundances <- df_abundances[ ,grep("A_", colnames(df_abundances))] +
df_abundances[ ,grep("B_", colnames(df_abundances))]
dim(abundances)
## [1] 406 45
condition <- rep("LBA", ncol(abundances))
condition[grep("EN", colnames(abundances))] <- "EN"
condition <- factor(condition)
table(condition)
## condition
## EN LBA
## 22 23
id_abundances <- as.character(colnames(abundances))
id_abundances <- sapply(id_abundances, function(ac)
substr(ac, nchar(ac) - 19, nchar(ac) - 18))
id_abundances <- gsub("A", "0", id_abundances)
id_abundances <- gsub("N", "0", id_abundances)
table(id_abundances)
## id_abundances
## 01 03 04 06 07 09 10 11 15 16 17 18 19 20 21 23 24 25 26 28 29 30 31
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2
species <- sapply(df_abundances[ ,1], function(aname)
unlist(strsplit(aname, ";")), simplify = FALSE)
species <- sapply(species, function(avect) {
find_unknown <- grep("unknown", avect)
if (length(find_unknown) > 0) {
return(avect[-find_unknown])
} else return(avect)
})
species <- unlist(sapply(species, function(avect) avect[length(avect)]))
species <- unname(species)
abundances <- apply(abundances, 2, function(acol) tapply(acol, species, sum))
species <- rownames(abundances)
colnames(abundances) <- paste(condition,id_abundances,sep='_')
df_pathogenes <- read.delim("../data/pathogenes.csv", sep = ",")
dim(df_pathogenes)
## [1] 46 9
pathogenes <- df_pathogenes[ ,-c(1,9)]
pathogenes <- as.data.frame(ifelse(pathogenes == "p", 1, 0))
pathogenes <- as.data.frame(apply(t(pathogenes), 1, as.factor))
id_pathogenes <- df_pathogenes[ ,1]
id_pathogenes <- gsub("-", "0", id_pathogenes)
id_pathogenes <- gsub(" ", "", id_pathogenes)
id_pathogenes <- sapply(as.character(id_pathogenes), function(aname) {
substr(aname, nchar(aname)-1, nchar(aname))
})
df_pathogenes[ ,9] <- ifelse(df_pathogenes[ ,9] == "ENP", "EN", "LBA")
id_pathogenes <- paste0(id_pathogenes, "_", df_pathogenes[ ,9])
The list of farm names is not in the same order than in the abundance file. Both ordering are matched according to the abundance order (one sample is thus excluded from the analysis because the abundance data are not available):
pathogenes <- pathogenes[match(paste0(id_abundances, "_", condition),
id_pathogenes), ]
rownames(pathogenes) <- paste0(id_abundances, "_", condition)
df_abundances_ctrl <- read.delim("../data/abondances-ctrl.csv", sep = ",",
stringsAsFactors = FALSE)
summary(df_abundances_ctrl[ ,1:15])
## X.blast_taxonomy blast_subject blast_perc_identity
## Length:293 Length:293 Min. : 98.08
## Class :character Class :character 1st Qu.: 99.58
## Mode :character Mode :character Median :100.00
## Mean : 99.72
## 3rd Qu.:100.00
## Max. :100.00
## blast_perc_query_coverage blast_evalue blast_aln_length
## Min. : 96.66 Min. :0 Min. :430.0
## 1st Qu.:100.00 1st Qu.:0 1st Qu.:466.0
## Median :100.00 Median :0 Median :479.0
## Mean : 99.97 Mean :0 Mean :475.4
## 3rd Qu.:100.00 3rd Qu.:0 3rd Qu.:490.0
## Max. :100.00 Max. :0 Max. :512.0
## seed_id observation_name observation_sum
## Length:293 Length:293 Min. : 0
## Class :character Class :character 1st Qu.: 57
## Mode :character Mode :character Median : 137
## Mean : 1014
## 3rd Qu.: 364
## Max. :97278
## NG.13669_IDV1EN_lib206978_5685_2 NG.13669_IDV1LBA_lib206979_5685_2
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 2.00 Median : 0.00
## Mean : 84.53 Mean : 84.53
## 3rd Qu.: 18.00 3rd Qu.: 0.00
## Max. :11964.00 Max. :14102.00
## NG.13669_IDV2EN_lib206980_5685_2 NG.13669_IDV2LBA_lib206981_5685_2
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 4.00 Median : 0.00
## Mean : 84.53 Mean : 84.53
## 3rd Qu.: 40.00 3rd Qu.: 0.00
## Max. :2946.00 Max. :10748.00
## NG.13669_S2EN_lib206984_5685_2 NG.13669_S2LBA_lib206985_5685_2
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 2.00 Median : 0.00
## Mean : 84.53 Mean : 84.53
## 3rd Qu.: 15.00 3rd Qu.: 0.00
## Max. :8065.00 Max. :6340.00
which_selected <- grep("LBA_|EN_", colnames(df_abundances_ctrl))
abundances_ctrl <- df_abundances_ctrl[ ,which_selected]
dim(abundances_ctrl)
## [1] 293 12
condition_ctrl <- rep("LBA", ncol(abundances_ctrl))
condition_ctrl[grep("EN", colnames(abundances_ctrl))] <- "EN"
condition_ctrl <- factor(condition_ctrl)
table(condition_ctrl)
## condition_ctrl
## EN LBA
## 6 6
id_abundances_ctrl=rep(1:6, each = 2)
colnames(abundances_ctrl)=paste(condition_ctrl, id_abundances_ctrl, sep = "_")
Unique species are extracted similarly as for case samples:
species_ctrl <- as.data.frame(sapply(df_abundances_ctrl[ ,1], function(aname)
unlist(strsplit(aname, ";"))))
species_ctrl <- sapply(species_ctrl, function(avect) {
find_unknown <- grep("unknown", avect)
if (length(find_unknown) > 0) {
return(avect[-find_unknown])
} else return(avect)
})
species_ctrl <- unlist(sapply(species_ctrl, function(avect) avect[length(avect)]))
species_ctrl <- unname(species_ctrl)
species_ctrl <- sapply(species_ctrl, as.character)
abundances_ctrl <- apply(abundances_ctrl, 2, function(acol) tapply(acol, species_ctrl, sum))
species_ctrl <- rownames(abundances_ctrl)
In this first part of the exploratory analyses, a comparison of case and control samples is performed in terms of total count and variety of species.
df <- data.frame("total" = c(apply(abundances, 2, sum),
apply(abundances_ctrl, 2, sum)),
"condition" = c(condition, condition_ctrl),
"type" = rep(c("case", "control"),
c(ncol(abundances), ncol(abundances_ctrl))),
"sample" = c(paste("case", condition, id_abundances, sep = "_"),
paste("ctrl", colnames(abundances_ctrl), sep = "_")))
p <- ggplot(df, aes(x = sample, weight = total, fill = type)) + geom_bar() +
theme_bw() + ylab("total count") + xlab("samples") +
theme(axis.text.x = element_blank())
p
Control samples seem to have been badly normalized with two samples with a much larger total count. Moreover, the total count between case and control samples is very different, with a much smaller total count for control samples.
plot.new()
library(VennDiagram)
## Loading required package: grid
## Loading required package: futile.logger
vd <- venn.diagram(list("case" = species, "control" = species_ctrl),
fill = brewer.pal(3, "Set2")[1:2],
cat.col = "black", cat.cex = 1.8,cex = 1.7,cat.dist=-0.05,
fontface = "bold", filename = NULL)
grid.draw(vd)
## null device
## 1
## [1] TRUE
Case and control samples only have 117 bacteria in common. Extracting count data with only this common set of bacteria lead to the following comparison in terms of total count:
common_species <- intersect(species, species_ctrl)
write.table(common_species, file = "../data/common_species.txt",
row.names = FALSE, col.names = FALSE)
common_case <- abundances[common_species, ]
common_ctrl <- abundances_ctrl[common_species, ]
df <- data.frame("total" = c(apply(common_case, 2, sum),
apply(common_ctrl, 2, sum)),
"condition" = c(condition, condition_ctrl),
"type" = rep(c("case", "control"),
c(ncol(abundances), ncol(abundances_ctrl))),
"sample" = c(paste("case", condition, id_abundances, sep = "_"),
paste("ctrl", colnames(abundances_ctrl), sep = "_")))
p <- ggplot(df, aes(x = sample, weight = total, fill = type)) + geom_bar() +
theme_bw(base_size = 15) + ylab("total count") + xlab("samples") +
theme(axis.text.x = element_blank())
p
that shows an even larger differences between total counts when only common species are selected.
The first part of the analysis aims at finding signatures that are specific to EN / LBA samples. This is performed using two types of preprocessing and paired or unpaired analyses. The used method is sparse PLS-DA.
A first PLS-DA is computed (with 10-fold CV) to check the efficiency of the method and which type of distance to use in its computation.
set.seed(11)
res_plsda <- plsda(log(t(abundances)+1), condition, ncomp = nlevels(condition))
res_perf <- perf(res_plsda, validation = 'Mfold', folds = 10,
progressBar = FALSE, nrepeat = 100)
plot(res_perf, overlay = 'measure', sd = TRUE)
plotIndiv(res_plsda , comp = c(1, 2), ind.names = FALSE, ellipse = TRUE,
legend = TRUE, title = 'PLS-DA Comp 1 - 2',
size.title = rel(2),
size.ylabel = rel(1.5),
size.xlabel = rel(1.5),
size.legend.title = rel(1.5))
PLS-DA shows a good separation between the two groups and indicates that the Mahalanobis distance provides the lower overall classification error.
Then, sparse PLS-DA is used (with the multilevel approach) to check which number of components to select.
clean_log <- data.frame(log(t(abundances[ ,id_abundances != "29"]) + 1))
names(clean_log) <- species
clean_condition <- factor(condition[id_abundances != "29"])
clean_id <- factor(id_abundances[id_abundances != "29"])
set.seed(33)
res_plsda <- tune.splsda(clean_log, clean_condition,
ncomp = nlevels(clean_condition),
multilevel = clean_id,
test.keepX = 1:20, validation = 'Mfold', folds = 10,
dist = 'mahalanobis.dist', nrepeat = 10,
progressBar = FALSE)
## Splitting the variation for 1 level factor.
plot(res_plsda)
sel_keepX <- res_plsda$choice.keepX[1:2]
sel_keepX
## comp1 comp2
## 5 14
res_splsda <- splsda(clean_log, clean_condition,
ncomp = nlevels(clean_condition), multilevel = clean_id,
keepX = sel_keepX)
## Splitting the variation for 1 level factor.
plotIndiv(res_splsda, comp = c(1,2), ind.names = FALSE, ellipse = TRUE,
legend = TRUE, title = 'sPLS-DA Comp 1 - 2',
size.title = rel(2),
size.ylabel = rel(1.5),
size.xlabel = rel(1.5),
size.legend.title = rel(1.5))
head(selectVar(res_splsda, comp = 1)$value)
## value.var
## Mycoplasma bovoculi M165/69 -0.6975800
## Corynebacterium camporealensis -0.5978130
## Moraxella -0.3111287
## Aquamicrobium -0.1832142
## Peptoclostridium -0.1601039
par(mfrow=c(1,2))
plotLoadings(res_splsda, comp = 1, method = 'mean', contrib = 'max',
size.title = 1.3, size.xlabel = rel(1.5),legend = F)
plotLoadings(res_splsda, comp = 2, method = 'mean', contrib = 'max',
size.title = 1.3,legend = F)
par(mfrow=c(1,1))
In this section two complementary analyses are performed: a Student test to assess whether the found bacteria are differentially abundant between the two conditions and boxplots to visualize the abundance differences.
selected <- selectVar(res_splsda, comp = 1)$name
p<-list()
for (ind in seq_along(selected)) {
df <- data.frame(counts = abundances[selected[ind], ],
condition = condition)
p[[ind]] <- ggplot(df, aes(x = condition, y = counts + 1, fill = condition)) +
geom_boxplot() + theme_bw(base_size = 13) + scale_y_log10() +
scale_fill_manual(values = c("dodgerblue2","darkorange")) +
ggtitle(selected[ind]) + ylab(expression(log[10] ~ "(count + 1)"))
#print(p[ind])
}
multiplot(plotlist =p,cols=2)
all_pvals <- sapply(seq_along(selected), function(ind) {
var_en <- clean_log[clean_condition == "EN",selected[ind]]
var_lba <- clean_log[clean_condition == "LBA",selected[ind]]
pval <- t.test(var_en, var_lba, paired = TRUE)$p.value
})
res <- data.frame(bacteria = selected, pvalue = all_pvals,
FDR = p.adjust(all_pvals, method = "BH"))
res
## bacteria pvalue FDR
## 1 Mycoplasma bovoculi M165/69 3.728266e-12 1.553412e-11
## 2 Corynebacterium camporealensis 6.213648e-12 1.553412e-11
## 3 Moraxella 2.367632e-11 3.946053e-11
## 4 Aquamicrobium 4.077402e-11 4.484280e-11
## 5 Peptoclostridium 4.484280e-11 4.484280e-11
All tests are found positive: those bacteria are found differentially present in EN/LBA samples.
Moreover, among the selected species, those that are also found in the control samples are:
also_in_common <- intersect(selected, common_species)
also_in_common
## [1] "Corynebacterium camporealensis" "Aquamicrobium"
## [3] "Peptoclostridium"
For those species the comparison of the total count and relative abundance are provided in the following boxplots:
#p <- list()
for (ind in seq_along(also_in_common)) {
df <- data.frame(counts = c(abundances[also_in_common[ind], ],
abundances_ctrl[also_in_common[ind], ]),
condition = factor(c(as.character(condition),
as.character(condition_ctrl))),
type = rep(c("case", "control"),
c(ncol(abundances), ncol(abundances_ctrl))))
p <- ggplot(df, aes(x = condition, y = counts + 1, fill = condition)) +
geom_boxplot() + theme_bw(base_size = 15) + scale_y_log10() +
scale_fill_manual(values = c("dodgerblue2", "darkorange")) +
ggtitle(selected[ind]) + ylab(expression(log[10] ~ "(count + 1)")) +
facet_grid( ~ type)
print(p)
}
#multiplot(plotlist =p,cols=3)
That shows that, exept for Micoplasma bovoculi, the differences between LBA and EN (in terms of counts) is not related to the status (case/control) of the sample.
case_counts <- apply(abundances, 2, sum)
ctrl_counts <- apply(abundances_ctrl, 2, sum)
for (ind in seq_along(also_in_common)) {
freq_case <- abundances[also_in_common[ind], ] / case_counts
freq_ctrl <- abundances_ctrl[also_in_common[ind], ] / ctrl_counts
df <- data.frame(counts = c(freq_case, freq_ctrl),
condition = factor(c(as.character(condition),
as.character(condition_ctrl))),
type = rep(c("case", "control"),
c(ncol(abundances), ncol(abundances_ctrl))))
p <- ggplot(df, aes(x = condition, y = counts, fill = condition)) +
geom_boxplot() + theme_bw(base_size = 15) +
scale_fill_manual(values = c("dodgerblue2", "darkorange")) +
ggtitle(selected[ind]) + ylab("relative abundance") +
facet_grid( ~ type)
print(p)
}
that confirms the previous conclusion.
The same analysis is performed without scaling the data.
set.seed(11)
res_plsda <- plsda(log(t(abundances)+1), condition, ncomp = nlevels(condition),
scale = FALSE)
res_perf <- perf(res_plsda, validation = 'Mfold', folds = 5, # erreur si folds=10
progressBar = FALSE, nrepeat = 10)
plot(res_perf, overlay = 'measure', sd = TRUE)
plotIndiv(res_plsda , comp = c(1, 2), ind.names = FALSE, ellipse = TRUE,
legend = TRUE, title = 'PLS-DA Comp 1 - 2')
PLS-DA shows a good separation between the two groups and indicates that the Mahalanobis distance provides the lower overall classification error.
Then, sparse PLS-DA is used (with the multilevel approach) to check which number of components to select.
clean_log <- data.frame(log(t(abundances[ ,id_abundances != "29"]) + 1))
names(clean_log) <- species
clean_condition <- factor(condition[id_abundances != "29"])
clean_id <- factor(id_abundances[id_abundances != "29"])
set.seed(33)
res_plsda <- tune.splsda(clean_log, clean_condition,
ncomp = nlevels(clean_condition),
multilevel = clean_id,
test.keepX = 1:20, validation = 'Mfold', folds = 10,
dist = 'mahalanobis.dist', nrepeat = 10,
progressBar = FALSE, scale = FALSE)
## Splitting the variation for 1 level factor.
plot(res_plsda)
sel_keepX <- res_plsda$choice.keepX[1:2]
sel_keepX
## comp1 comp2
## 4 1
Finally sparse PLS-DA is performed and the variables explaining the two types of samples are obtained:
res_splsda <- splsda(clean_log, clean_condition,
ncomp = nlevels(clean_condition), multilevel = clean_id,
keepX = sel_keepX, scale = FALSE)
## Splitting the variation for 1 level factor.
plotIndiv(res_splsda, comp = c(1,2), ind.names = FALSE, ellipse = TRUE,
legend = TRUE, title = 'sPLS-DA Comp 1 - 2')
head(selectVar(res_splsda, comp = 1)$value)
## value.var
## Streptococcus pluranimalium -0.61298859
## Mycoplasma bovoculi M165/69 -0.56123953
## Moraxella -0.54756384
## Mannheimia sp. -0.09710312
plotLoadings(res_splsda, comp = 1, method = 'mean', contrib = 'max',
size.title = 1)
In this section two complementary analyses are performed: a Student test to assess whether the found bacteria are differentially abundant between the two conditions and boxplots to visualize the abundance differences.
selected <- selectVar(res_splsda, comp = 1)$name
for (ind in seq_along(selected)) {
df <- data.frame(counts = abundances[selected[ind], ],
condition = condition)
p <- ggplot(df, aes(x = condition, y = counts + 1, fill = condition)) +
geom_boxplot() + theme_bw() + scale_y_log10() +
scale_fill_manual(values = c("dodgerblue2","darkorange")) +
ggtitle(selected[ind]) + ylab(expression(log[10] ~ "(count + 1)"))
print(p)
}
all_pvals <- sapply(seq_along(selected), function(ind) {
var_en <- clean_log[clean_condition == "EN",selected[ind]]
var_lba <- clean_log[clean_condition == "LBA",selected[ind]]
pval <- t.test(var_en, var_lba, paired = TRUE)$p.value
})
res <- data.frame(bacteria = selected, pvalue = all_pvals,
FDR = p.adjust(all_pvals, method = "BH"))
res
## bacteria pvalue FDR
## 1 Streptococcus pluranimalium 3.738224e-08 3.738224e-08
## 2 Mycoplasma bovoculi M165/69 3.728266e-12 1.491306e-11
## 3 Moraxella 2.367632e-11 4.735264e-11
## 4 Mannheimia sp. 4.288368e-09 5.717824e-09
This section tries to make a link between abundance dataset and virus presence. It is divided into two parts: the first one performs a multivariate analysis relating the presence or absence of any virus to the abundance data. The second one aims at predicting a specific group of three viruses from abundance data.
In the sequel, the presence/absence of viruses is predicted from abundance data. Since most virus presence is very rare (with highly unbalanced datasets), a group of three viruses RSV, PI.3, Coronavirus is defined and used as a target for prediction.
The factor corresponding to these three virus presence is thus defined:
group_virus <- apply(sapply(pathogenes, as.numeric) - 1, 2, as.logical)
group_virus <- (group_virus[ ,1]) | (group_virus[ ,2]) | (group_virus[ ,3])
group_virus <- as.factor(as.numeric(group_virus))
table(group_virus)
## group_virus
## 0 1
## 18 27
sel_samples <- which(condition == "EN")
set.seed(11)
res_plsda <- plsda(X = log(t(abundances)+1)[sel_samples, ],
Y = group_virus[sel_samples],
ncomp = nlevels(group_virus))
## Warning in cor(A[[k]], variates.A[[k]]): l'écart type est nulle
res_perf <- perf(res_plsda, validation = 'Mfold', folds = 5,
progressBar = FALSE, nrepeat = 10)
plot(res_perf, overlay = 'measure', sd = TRUE)
plotIndiv(res_plsda , comp = c(1, 2), ind.names = FALSE, ellipse = TRUE,
legend = TRUE, title = 'PLS-DA (RSV, PI.3 and Coronavirus) - EN',
size.title = rel(2),size.xlabel = rel(1.5),size.ylabel = rel(1.5),
size.legend = rel(1))
set.seed(33)
res_plsda <- tune.splsda(X = log(t(abundances)+1)[sel_samples, ],
Y = group_virus[sel_samples],
ncomp = nlevels(group_virus),
test.keepX = 1:20, validation = 'Mfold',
folds = 5, dist = 'mahalanobis.dist', nrepeat = 10,
progressBar = FALSE)
plot(res_plsda)
sel_keepX <- res_plsda$choice.keepX[1:2]
sel_keepX
## comp1 comp2
## 2 6
res_splsda <- splsda(X = log(t(abundances)+1)[sel_samples, ],
Y = group_virus[sel_samples], ncomp = nlevels(group_virus),
keepX = c(20,20)) ## using results from "nrepeat=100"
## Warning in cor(A[[k]], variates.A[[k]]): l'écart type est nulle
plotIndiv(res_splsda, comp = c(1,2), ind.names = FALSE, ellipse = TRUE,
legend = TRUE, title = 'sPLS-DA (RSV, PI.3 and Coronavirus) - EN',
size.title = rel(2),size.xlabel = rel(1.5),size.ylabel = rel(1.5),
size.legend = rel(1))
selectVar(res_splsda, comp = 1)$name
## [1] "Leucobacter" "Corynebacterium confusum"
## [3] "Pseudochrobactrum" "Gelidibacter"
## [5] "Streptococcus gallolyticus" "Moraxella lacunata"
## [7] "Gracilibacteria" "Ochrobactrum sp."
## [9] "Arthrobacter sp." "Trueperella pyogenes"
## [11] "Faecalicoccus pleomorphus" "Phyllobacteriaceae"
## [13] "Propionibacteriaceae" "Helcococcus ovis"
## [15] "Sporosarcina sp." "Olivibacter"
## [17] "Chryseobacterium sp." "Delftia sp."
## [19] "Mycoplasma bovoculi M165/69" "Lachnoclostridium"
plotLoadings(res_splsda, comp = 1, method = 'mean', contrib = 'max',
size.title = rel(1.5),size.axis = rel(1.5),size.legend = rel(1))
selected <- selectVar(res_splsda, comp = 1)$name
p <- list()
for (ind in seq_along(selected[1:10])) {
df <- data.frame(counts = abundances[selected[ind],sel_samples],
condition = group_virus[sel_samples])
p[[ind]] <- ggplot(df, aes(x = condition, y = counts + 1, fill = condition)) +
geom_boxplot() + theme_bw() + scale_y_log10() +
scale_fill_manual(values = c("dodgerblue2","darkorange")) +
ggtitle(selected[ind]) + ylab(expression(log[10] ~ "(count + 1)"))
#print(p)
}
multiplot(plotlist=p,cols=2)
all_pvals <- sapply(seq_along(selected), function(ind) {
all_data <- log(abundances[selected[ind],sel_samples] + 1)
pval <- t.test(all_data ~ group_virus[sel_samples])$p.value
})
res <- data.frame(bacteria = selected, pvalue = all_pvals,
FDR = p.adjust(all_pvals, method = "BH"))
res
## bacteria pvalue FDR
## 1 Leucobacter 0.03481991 0.08704976
## 2 Corynebacterium confusum 0.03045623 0.08704976
## 3 Pseudochrobactrum 0.03477661 0.08704976
## 4 Gelidibacter 0.01618017 0.08704976
## 5 Streptococcus gallolyticus 0.02026699 0.08704976
## 6 Moraxella lacunata 0.01911170 0.08704976
## 7 Gracilibacteria 0.02123296 0.08704976
## 8 Ochrobactrum sp. 0.07461385 0.12304604
## 9 Arthrobacter sp. 0.09038226 0.12304604
## 10 Trueperella pyogenes 0.04204057 0.09342350
## 11 Faecalicoccus pleomorphus 0.02917639 0.08704976
## 12 Phyllobacteriaceae 0.08753570 0.12304604
## 13 Propionibacteriaceae 0.15041696 0.16178188
## 14 Helcococcus ovis 0.13733066 0.16178188
## 15 Sporosarcina sp. 0.05414143 0.10828286
## 16 Olivibacter 0.09228453 0.12304604
## 17 Chryseobacterium sp. 0.14258569 0.16178188
## 18 Delftia sp. 0.16064938 0.16178188
## 19 Mycoplasma bovoculi M165/69 0.06573544 0.11951897
## 20 Lachnoclostridium 0.16178188 0.16178188
None of the bacteria are found differentially abundant between infected and non infected samples.
Moreover, among the selected species, those that are also found in the control samples are:
also_in_common <- intersect(selected, common_species)
also_in_common
## [1] "Leucobacter" "Gelidibacter"
## [3] "Ochrobactrum sp." "Arthrobacter sp."
## [5] "Trueperella pyogenes" "Faecalicoccus pleomorphus"
## [7] "Helcococcus ovis" "Sporosarcina sp."
## [9] "Delftia sp."
For those species the comparison of the total count and relative abundance are provided in the following boxplots:
viruses <- c("no virus", "virus")[as.numeric(group_virus[sel_samples])]
sel_ctrl <- which(condition_ctrl == "EN")
p <- list()
for (ind in seq_along(also_in_common[1:4])) {
df <- data.frame(counts = c(abundances[also_in_common[ind],sel_samples],
abundances_ctrl[also_in_common[ind],sel_ctrl]),
condition = factor(c(viruses, rep("control", length(sel_ctrl)))))
p[[ind]] <- ggplot(df, aes(x = condition, y = counts + 1, fill = condition)) +
geom_boxplot() + theme_bw(base_size = 15) + scale_y_log10() +
scale_fill_manual(values = c("green", "dodgerblue2", "darkorange")) +
ggtitle(selected[ind]) + ylab(expression(log[10] ~ "(count + 1)"))
#print(p)
}
multiplot(plotlist = p,cols=2)
case_counts <- apply(abundances[ ,sel_samples], 2, sum)
ctrl_counts <- apply(abundances_ctrl[ ,sel_ctrl], 2, sum)
p <- list()
for (ind in seq_along(also_in_common[1:4])) {
freq_case <- abundances[also_in_common[ind],sel_samples] / case_counts
freq_ctrl <- abundances_ctrl[also_in_common[ind],sel_ctrl] / ctrl_counts
df <- data.frame(counts = c(freq_case, freq_ctrl),
condition = factor(c(viruses, rep("control", length(sel_ctrl)))))
p[[ind]] <- ggplot(df, aes(x = condition, y = counts, fill = condition)) +
geom_boxplot() + theme_bw(base_size = 15) +
scale_fill_manual(values = c("green", "dodgerblue2", "darkorange")) +
ggtitle(selected[ind]) + ylab("relative abundance")
#print(p)
}
multiplot(plotlist = p,cols=2)
sel_samples <- which(condition == "EN")
set.seed(11)
res_plsda <- plsda(X = log(t(abundances)+1)[sel_samples, ],
Y = group_virus[sel_samples],
ncomp = nlevels(group_virus),
scale = FALSE)
## Warning in cor(A[[k]], variates.A[[k]]): l'écart type est nulle
res_perf <- perf(res_plsda, validation = 'Mfold', folds = 5,
progressBar = FALSE, nrepeat = 10)
plot(res_perf, overlay = 'measure', sd = TRUE)
plotIndiv(res_plsda , comp = c(1, 2), ind.names = FALSE, ellipse = TRUE,
legend = TRUE, title = 'PLS-DA (RSV, PI.3 and Coronavirus) - EN')
set.seed(33)
res_plsda <- tune.splsda(X = log(t(abundances)+1)[sel_samples, ],
Y = group_virus[sel_samples],
ncomp = nlevels(group_virus),
test.keepX = 1:20, validation = 'Mfold',
folds = 5, dist = 'mahalanobis.dist', nrepeat = 10,
progressBar = FALSE, scale = FALSE)
plot(res_plsda)
sel_keepX <- res_plsda$choice.keepX[1:2]
sel_keepX
## comp1 comp2
## 6 2
res_splsda <- splsda(X = log(t(abundances)+1)[sel_samples, ],
Y = group_virus[sel_samples], ncomp = nlevels(group_virus),
keepX = sel_keepX, scale = FALSE)
## Warning in cor(A[[k]], variates.A[[k]]): l'écart type est nulle
plotIndiv(res_splsda, comp = c(1,2), ind.names = FALSE, ellipse = TRUE,
legend = TRUE, title = 'sPLS-DA (RSV, PI.3 and Coronavirus) - EN')
selectVar(res_splsda, comp = 1)$name
## [1] "Moraxella lacunata" "Trueperella pyogenes"
## [3] "Ureaplasma" "Corynebacterium confusum"
## [5] "Gracilibacteria" "Leptotrichiaceae"
plotLoadings(res_splsda, comp = 1, method = 'mean', contrib = 'max',
size.title = 1)
selected <- selectVar(res_splsda, comp = 1)$name
for (ind in seq_along(selected)) {
df <- data.frame(counts = abundances[selected[ind],sel_samples],
condition = group_virus[sel_samples])
p <- ggplot(df, aes(x = condition, y = counts + 1, fill = condition)) +
geom_boxplot() + theme_bw() + scale_y_log10() +
scale_fill_manual(values = c("dodgerblue2","darkorange")) +
ggtitle(selected[ind]) + ylab(expression(log[10] ~ "(count + 1)"))
print(p)
}
all_pvals <- sapply(seq_along(selected), function(ind) {
all_data <- log(abundances[selected[ind],sel_samples] + 1)
pval <- t.test(all_data ~ group_virus[sel_samples])$p.value
})
res <- data.frame(bacteria = selected, pvalue = all_pvals,
FDR = p.adjust(all_pvals, method = "BH"))
res
## bacteria pvalue FDR
## 1 Moraxella lacunata 0.01911170 0.06091245
## 2 Trueperella pyogenes 0.04204057 0.06306086
## 3 Ureaplasma 0.13259862 0.14147549
## 4 Corynebacterium confusum 0.03045623 0.06091245
## 5 Gracilibacteria 0.02123296 0.06091245
## 6 Leptotrichiaceae 0.14147549 0.14147549
sel_samples <- which(condition == "LBA")
set.seed(11)
res_plsda <- plsda(X = log(t(abundances)+1)[sel_samples, ],
Y = group_virus[sel_samples],
ncomp = nlevels(group_virus))
## Warning in cor(A[[k]], variates.A[[k]]): l'écart type est nulle
res_perf <- perf(res_plsda, validation = 'Mfold', folds = 5,
progressBar = FALSE, nrepeat = 10)
plot(res_perf, overlay = 'measure', sd = TRUE)
plotIndiv(res_plsda , comp = c(1, 2), ind.names = FALSE, ellipse = TRUE,
legend = TRUE, title = 'PLS-DA (RSV, PI.3 and Coronavirus) - EN',
size.title = rel(2),size.xlabel = rel(1.5),size.ylabel = rel(1.5),
size.legend = rel(1))
set.seed(33)
res_plsda <- tune.splsda(X = log(t(abundances)+1)[sel_samples, ],
Y = group_virus[sel_samples],
ncomp = nlevels(group_virus),
test.keepX = 1:20, validation = 'Mfold',
folds = 5, dist = 'mahalanobis.dist', nrepeat = 10,
progressBar = FALSE)
plot(res_plsda)
sel_keepX <- res_plsda$choice.keepX[1:2]
sel_keepX
## comp1 comp2
## 15 14
res_splsda <- splsda(X = log(t(abundances)+1)[sel_samples, ],
Y = group_virus[sel_samples], ncomp = nlevels(group_virus),
keepX = c(20,20)) ## using results from "nrepeat=100"
## Warning in cor(A[[k]], variates.A[[k]]): l'écart type est nulle
plotIndiv(res_splsda, comp = c(1,2), ind.names = FALSE, ellipse = TRUE,
legend = TRUE, title = 'sPLS-DA (RSV, PI.3 and Coronavirus) - EN',
size.title = rel(2),size.xlabel = rel(1.5),size.ylabel = rel(1.5),
size.legend = rel(1))
selectVar(res_splsda, comp = 1)$name
## [1] "Brachybacterium sp." "Jeotgalicoccus psychrophilus"
## [3] "Psychrobacter pulmonis" "Paracoccus alcaliphilus"
## [5] "Staphylococcus arlettae CVD059" "Ruminococcaceae UCG-005"
## [7] "Curtobacterium flaccumfaciens" "Lachnospiraceae NK3A20 group"
## [9] "Jeotgalicoccus" "Delftia sp."
## [11] "Mycoplasma bovis" "Sphingomonas sp."
## [13] "Facklamia" "Comamonas"
## [15] "Trichococcus" "Helcococcus ovis"
## [17] "Nocardiopsis" "Leptotrichiaceae"
## [19] "Thauera" "Desemzia incerta"
plotLoadings(res_splsda, comp = 1, method = 'mean', contrib = 'max',
size.title = rel(1.5),size.axis = rel(1.5),size.legend = rel(1))
selected <- selectVar(res_splsda, comp = 1)$name
p <- list()
for (ind in seq_along(selected[1:10])) {
df <- data.frame(counts = abundances[selected[ind],sel_samples],
condition = group_virus[sel_samples])
p[[ind]] <- ggplot(df, aes(x = condition, y = counts + 1, fill = condition)) +
geom_boxplot() + theme_bw() + scale_y_log10() +
scale_fill_manual(values = c("dodgerblue2","darkorange")) +
ggtitle(selected[ind]) + ylab(expression(log[10] ~ "(count + 1)"))
#print(p)
}
multiplot(plotlist=p,cols=2)
all_pvals <- sapply(seq_along(selected), function(ind) {
all_data <- log(abundances[selected[ind],sel_samples] + 1)
pval <- t.test(all_data ~ group_virus[sel_samples])$p.value
})
res <- data.frame(bacteria = selected, pvalue = all_pvals,
FDR = p.adjust(all_pvals, method = "BH"))
res
## bacteria pvalue FDR
## 1 Brachybacterium sp. 0.05052448 0.1447919
## 2 Jeotgalicoccus psychrophilus 0.04775828 0.1447919
## 3 Psychrobacter pulmonis 0.04029216 0.1447919
## 4 Paracoccus alcaliphilus 0.06314502 0.1447919
## 5 Staphylococcus arlettae CVD059 0.09985810 0.1521428
## 6 Ruminococcaceae UCG-005 0.10649995 0.1521428
## 7 Curtobacterium flaccumfaciens 0.02126597 0.1447919
## 8 Lachnospiraceae NK3A20 group 0.06462858 0.1447919
## 9 Jeotgalicoccus 0.08698581 0.1521428
## 10 Delftia sp. 0.07341817 0.1468363
## 11 Mycoplasma bovis 0.13123811 0.1543978
## 12 Sphingomonas sp. 0.02971395 0.1447919
## 13 Facklamia 0.10558693 0.1521428
## 14 Comamonas 0.12787987 0.1543978
## 15 Trichococcus 0.12570172 0.1543978
## 16 Helcococcus ovis 0.14864534 0.1588076
## 17 Nocardiopsis 0.03295577 0.1447919
## 18 Leptotrichiaceae 0.06515637 0.1447919
## 19 Thauera 0.18595844 0.1859584
## 20 Desemzia incerta 0.15086719 0.1588076
None of the bacteria are found differentially abundant between infected and non infected samples.
Moreover, among the selected species, those that are also found in the control samples are:
also_in_common <- intersect(selected, common_species)
also_in_common
## [1] "Brachybacterium sp." "Paracoccus alcaliphilus"
## [3] "Ruminococcaceae UCG-005" "Lachnospiraceae NK3A20 group"
## [5] "Jeotgalicoccus" "Delftia sp."
## [7] "Sphingomonas sp." "Facklamia"
## [9] "Comamonas" "Trichococcus"
## [11] "Helcococcus ovis" "Thauera"
For those species the comparison of the total count and relative abundance are provided in the following boxplots:
viruses <- c("no virus", "virus")[as.numeric(group_virus[sel_samples])]
sel_ctrl <- which(condition_ctrl == "EN")
p <- list()
for (ind in seq_along(also_in_common[1:4])) {
df <- data.frame(counts = c(abundances[also_in_common[ind],sel_samples],
abundances_ctrl[also_in_common[ind],sel_ctrl]),
condition = factor(c(viruses, rep("control", length(sel_ctrl)))))
p[[ind]] <- ggplot(df, aes(x = condition, y = counts + 1, fill = condition)) +
geom_boxplot() + theme_bw(base_size = 15) + scale_y_log10() +
scale_fill_manual(values = c("green", "dodgerblue2", "darkorange")) +
ggtitle(selected[ind]) + ylab(expression(log[10] ~ "(count + 1)"))
#print(p)
}
multiplot(plotlist = p,cols=2)
case_counts <- apply(abundances[ ,sel_samples], 2, sum)
ctrl_counts <- apply(abundances_ctrl[ ,sel_ctrl], 2, sum)
p <- list()
for (ind in seq_along(also_in_common[1:4])) {
freq_case <- abundances[also_in_common[ind],sel_samples] / case_counts
freq_ctrl <- abundances_ctrl[also_in_common[ind],sel_ctrl] / ctrl_counts
df <- data.frame(counts = c(freq_case, freq_ctrl),
condition = factor(c(viruses, rep("control", length(sel_ctrl)))))
p[[ind]] <- ggplot(df, aes(x = condition, y = counts, fill = condition)) +
geom_boxplot() + theme_bw(base_size = 15) +
scale_fill_manual(values = c("green", "dodgerblue2", "darkorange")) +
ggtitle(selected[ind]) + ylab("relative abundance")
#print(p)
}
multiplot(plotlist = p,cols=2)
data_commun_sick <- abundances[which(rownames(abundances)%in%rownames(abundances_ctrl)),]
data_commun_ctrl <- abundances_ctrl[which(rownames(abundances_ctrl)%in%rownames(abundances)),]
data_supp <- cbind.data.frame(data_commun_sick,data_commun_ctrl)
log_commun_sick<-log(t(data_commun_sick)+1)
log_commun_ctrl<-log(t(data_commun_ctrl)+1)
pca_raw <- pca(log_commun_sick, ncomp = ncol(data_commun_sick),
logratio = 'none',scale=TRUE)
Adding supplementary individuals
plotIndiv(pca_raw,
comp = c(1,2),
pch = 16,
ind.names = FALSE,
group=condition,
col.per.group = color.mixo(1:2),
legend = TRUE,
style="graphics",
title="PCA for log-tranformed data",cex=1.5,size.xlabel = rel(1.5),size.legend = rel(1.5))
suppIndiv <- ((log_commun_ctrl-mean(log_commun_sick))/sd(log_commun_sick))%*%pca_raw$rotation[,1:2]
points(suppIndiv[,1],suppIndiv[,2],
col=ifelse(condition_ctrl=='EN',"dodgerblue2","darkorange"),pch=1,cex=1.5)
indiv <- scale(log_commun_sick)%*%pca_raw$rotation[,1:2]
plot(indiv[,1],indiv[,2],col=ifelse(clean_condition=='EN',"dodgerblue2","darkorange"),pch=16,main="PCA")
points(suppIndiv[,1],suppIndiv[,2],
col=ifelse(condition_ctrl=='EN',"dodgerblue2","darkorange"),pch=1)
sel_samples <- which(condition == "EN")
set.seed(11)
res_plsda <- plsda(X = log_commun_sick[sel_samples, ],
Y = group_virus[sel_samples],
ncomp = nlevels(group_virus),
scale = TRUE)
res_perf <- perf(res_plsda, validation = 'Mfold', folds = 5,
progressBar = FALSE, nrepeat = 10)
plot(res_perf, overlay = 'measure', sd = TRUE)
plotIndiv(res_plsda , comp = c(1, 2), ind.names = FALSE, ellipse = FALSE,
legend = TRUE,
title = 'PLS-DA (RSV, PI.3 and Coronavirus) - EN',style="graphics")
suppIndiv <- ((log_commun_ctrl[which(condition_ctrl == "EN"),]
-mean(log_commun_sick[sel_samples,]))/sd(log_commun_sick[sel_samples,]))%*%res_plsda$loadings$X
points(suppIndiv[,1],suppIndiv[,2],pch=17)
indiv=scale(log_commun_sick[sel_samples,])%*%res_plsda$loadings$X
plot(indiv[,1],indiv[,2],col=ifelse(group_virus[sel_samples]=='0',"dodgerblue2","darkorange"),pch=16)
points(suppIndiv[,1],suppIndiv[,2],pch=17)
set.seed(33)
res_plsda <- tune.splsda(log_commun_sick[id_abundances != "29",], clean_condition,
ncomp = nlevels(clean_condition),
multilevel = clean_id,
test.keepX = 1:20, validation = 'Mfold', folds = 10,
dist = 'mahalanobis.dist', nrepeat = 10,
progressBar = FALSE,scale=FALSE) #ajout de scale=FALSE sinon résultats incohérents
## Splitting the variation for 1 level factor.
plot(res_plsda)
sel_keepX <- res_plsda$choice.keepX[1:2]
sel_keepX
## comp1 comp2
## 7 11
res_splsda <- splsda(log_commun_sick[id_abundances != "29",], clean_condition,
ncomp = nlevels(clean_condition), multilevel = clean_id,
keepX = sel_keepX, scale=FALSE)
## Splitting the variation for 1 level factor.
plot('n',xlim=c(-4,9),ylim=c(-5,5))
## Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduits lors de la
## conversion automatique
plotIndiv(res_splsda, comp = c(1,2), ind.names = FALSE, ellipse = TRUE,
legend = TRUE, title = 'sPLS-DA Comp 1 - 2',style="graphics")
suppIndiv <- log_commun_ctrl%*%res_splsda$loadings$X
chose = (log_commun_sick[id_abundances!="29", ]) %*% res_splsda$loadings$X
points(suppIndiv[,1],suppIndiv[,2],col=ifelse(condition_ctrl=='EN',"dodgerblue2","darkorange"),pch=16,cex=1.5)
plotLoadings(res_splsda, comp = 1, method = 'mean', contrib = 'max',size.title = 1)
truc = cbind(data_commun_ctrl, data_commun_sick)
truc = data.frame(t(truc), cond = paste0(c(condition_ctrl,condition), c(rep("C", 12), rep("S", 45))))
boxplot(log(truc$Streptococcus.pluranimalium + 1) ~ truc$cond)
boxplot(log(truc$Pseudomonas.sp. + 1) ~ truc$cond)
boxplot(log(truc$Mannheimia.sp. + 1) ~ truc$cond)
EN conditionsel_samples <- which(condition == "EN")
set.seed(1205)
spls <- spls(X = log(t(abundances)+1)[sel_samples, ],
sapply(pathogenes[sel_samples, ], as.numeric) - 1, ncomp = 2,
mode = "regression", keepX = c(10,10), keepY = c(7,7))
## Warning in cor(A[[k]], variates.A[[k]]): l'écart type est nulle
plotIndiv(spls, comp = 1:2, rep.space= 'XY-variate',
ind.names = id_abundances[sel_samples], legend = F,
title = 'sPLS comp 1 - 2, XY-space',size.title = rel(2),
size.ylabel = rel(1.5), size.xlabel = rel(1.5),cex=5)
plotVar(spls, comp = 1:2, var.names = list(X.label = TRUE, Y.label = TRUE),
cex = c(4, 6))
LBA conditionsel_samples <- which(condition == "LBA")
set.seed(1205)
spls <- spls(X = log(t(abundances)+1)[sel_samples, ],
sapply(pathogenes[sel_samples, ], as.numeric) - 1, ncomp = 2,
mode = "regression", keepX = c(10,10), keepY = c(7,7))
## Warning in cor(A[[k]], variates.A[[k]]): l'écart type est nulle
plotIndiv(spls, comp = 1:2, rep.space= 'XY-variate',
ind.names = id_abundances[sel_samples], legend = F,
title = 'sPLS comp 1 - 2, XY-space',size.title = rel(2),
size.ylabel = rel(1.5), size.xlabel = rel(1.5),cex=5)
plotVar(spls, comp = 1:2, var.names = list(X.label = TRUE, Y.label = TRUE),
cex = c(4, 6))